rm(list = ls())
load("senate.RData")

save(list = ls(all = TRUE), file= "senate.RData")

install.packages("")

library(scales)
library(readxl)
library(sf)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(units)
library(MASS)
library(ggeffects)
library(tidyr)
library(stringr)
library(purrr)
library(stargazer)
library(car)
library(patchwork)

##################################
########## LOAD DATA #############
##################################

load("Czech_senate_campaign_finance_dataset.RData")

serk24 <- read_excel("data/serk24.xlsx")
serk22 <- read_excel("data/serk22.xlsx")
serk20 <- read_excel("data/serk20.xlsx")
serk18 <- read_excel("data/serk18.xlsx")

cvs24 <- read_excel("data/SE2024ciselniky20240928_xlsx/cvs.xlsx")
cvs22 <- read_excel("data/SE2022ciselniky20220916/cvs.xlsx")
cvs20 <- read_excel("data/SE2020ciselnik20200918/cvs.xlsx")
cvs18 <- read_excel("data/SE2018ciselnik20181004/cvs.xlsx")

serk24 <- merge(serk24,cvs24[,c("VSTRANA","ZKRATKAV30","ZKRATKAV8")])
serk22 <- merge(serk22,cvs22[,c("VSTRANA","ZKRATKAV30","ZKRATKAV8")])
serk20 <- merge(serk20,cvs20[,c("VSTRANA","ZKRATKAV30","ZKRATKAV8")])
serk18 <- merge(serk18,cvs18[,c("VSTRANA","ZKRATKAV30","ZKRATKAV8")])

serk24$election <- 2024
serk22$election <- 2022
serk20$election <- 2020
serk18$election <- 2018

serk <- rbind(serk18,serk20,serk22,serk24)
serk <- serk %>% rename(candidateList = VSTRANA)
serk <- serk %>% rename(constituency = OBVOD)
serk <- serk[serk$PLATNOST=="A",] # exclude invalid candidates

map <- st_read("VO_Senat_2022g300/vo_senat_2022g300.shp", options = "ENCODING=WINDOWS-1250")
map$OBVOD <- as.numeric(map$OBVOD)
map <- map %>% rename(constituency = OBVOD)

set5_12 <- read_excel("data/set5/set5_2012.xlsx")
set5_14 <- read_excel("data/set5/set5_2014.xlsx")
set5_16 <- read_excel("data/set5/set5_2016.xlsx")
set5_18 <- read_excel("data/set5/set5_2018.xlsx")

table(serk18$VSTRANA)
table(serk20$VSTRANA)
table(serk22$VSTRANA)
table(serk24$VSTRANA)

###################################
########## BASIC DESCRIPTION ######
###################################

table(expenditures$election)
table(contributions$election)
table(donations$election)



#############################
########## ADJUST DATA ######
#############################

# past turnout
set5_12 <- set5_12 %>%
  filter(KOLO != 2) %>%
  group_by(OBVOD) %>%
  summarise(
    VOL_SEZNAM = sum(VOL_SEZNAM, na.rm = TRUE),
    VYD_OBALKY = sum(VYD_OBALKY, na.rm = TRUE)
  ) %>%
  mutate(
    TURNOUT = VYD_OBALKY / VOL_SEZNAM *100
  )
set5_14 <- set5_14 %>%
  filter(KOLO != 2) %>%
  group_by(OBVOD) %>%
  summarise(
    VOL_SEZNAM = sum(VOL_SEZNAM, na.rm = TRUE),
    VYD_OBALKY = sum(VYD_OBALKY, na.rm = TRUE)
  ) %>%
  mutate(
    TURNOUT = VYD_OBALKY / VOL_SEZNAM *100
  )
set5_16 <- set5_16 %>%
  filter(KOLO != 2) %>%
  group_by(OBVOD) %>%
  summarise(
    VOL_SEZNAM = sum(VOL_SEZNAM, na.rm = TRUE),
    VYD_OBALKY = sum(VYD_OBALKY, na.rm = TRUE)
  ) %>%
  mutate(
    TURNOUT = VYD_OBALKY / VOL_SEZNAM *100
  )
set5_18 <- set5_18 %>%
  filter(KOLO != 2) %>%
  group_by(OBVOD) %>%
  summarise(
    VOL_SEZNAM = sum(VOL_SEZNAM, na.rm = TRUE),
    VYD_OBALKY = sum(VYD_OBALKY, na.rm = TRUE)
  ) %>%
  mutate(
    TURNOUT = VYD_OBALKY / VOL_SEZNAM *100
  )
set5_12$election <- 2018
set5_14$election <- 2020
set5_16$election <- 2022
set5_18$election <- 2024
set5 <- rbind(set5_12,set5_14,set5_16,set5_18)
set5 <- set5 %>% rename(constituency = OBVOD)

# mark incumbents among candidates
serk <- serk %>%
  mutate(
    incumbent = grepl("senát", POVOLANI, ignore.case = TRUE) &
      !grepl("emeritní|bývalý|bývalá", POVOLANI, ignore.case = TRUE)
  )

# differentiate gender of candidates
serk <- serk %>%
  mutate(female = grepl("ová$", PRIJMENI, ignore.case = TRUE))
serk$female <- as.numeric(serk$female)

# identify university education of candidates
serk$university <- ifelse(is.na(serk$TITULPRED) & is.na(serk$TITULZA),0,1)

# candidates summary by elections and constituencies
candidates <- serk %>%
  group_by(election, constituency) %>%
  summarise(
    candidates = n(),
    incumbent = sum(incumbent, na.rm = TRUE),
    .groups = "drop"
  )

# margins in constituencies
margins <- serk %>%
  filter(!is.na(PROC_K1)) %>%
  group_by(election, constituency) %>%
  arrange(desc(PROC_K1), .by_group = TRUE) %>%
  summarise(
    first  = nth(PROC_K1, 1),
    second = nth(PROC_K1, 2), 
    third  = nth(PROC_K1, 3),
    margin_2nd_3rd = second - third,
    over_50 = first > 50,
    margin_1st_2nd = if (first > 50) {
      first - second
    } else {
      k2_first  = max(PROC_K2[!is.na(PROC_K2)], na.rm = TRUE)
      k2_second = sort(PROC_K2[!is.na(PROC_K2)], decreasing = TRUE)[2]
      k2_first - k2_second
    },
    .groups = "drop"
  )

# calculate distance from Prague
map <- map %>%
  mutate(
    distance = ifelse(grepl("Praha", SIDLO, ignore.case = TRUE), 0, NA_real_)
  )
praha_polygons <- map %>% filter(distance == 0)
other_polygons <- map %>% filter(is.na(distance))
other_centroids <- st_centroid(other_polygons)
praha_centroids <- st_centroid(praha_polygons)
dist_matrix <- st_distance(other_centroids, praha_centroids)
min_dist <- apply(dist_matrix, 1, min)
other_polygons <- other_polygons %>%
  mutate(distance = set_units(min_dist, "m") %>% set_units("km") %>% drop_units())
map <- bind_rows(praha_polygons, other_polygons)
rm(praha_centroids,praha_polygons,other_centroids,other_polygons)

# expenditures by candidates
exp_candidates <- expenditures %>%
  mutate(row_id = row_number()) %>%
  separate_rows(constituency, sep = ",") %>%
  mutate(constituency = str_trim(constituency)) %>%
  group_by(row_id) %>%
  mutate(n_constituencies = n()) %>%
  ungroup() %>%
  mutate(adjusted_price = usualPrice / n_constituencies) %>%
  group_by(constituency, election, candidateList) %>%
  summarise(
    total_expenditures = sum(adjusted_price, na.rm = TRUE),
    .groups = "drop"
  )
exp_candidates$constituency <- as.numeric(exp_candidates$constituency)

# contributions by candidates
con_candidates <- contributions %>%
  mutate(row_id = row_number()) %>%
  separate_rows(constituency, sep = ",") %>%
  mutate(constituency = str_trim(constituency)) %>%
  group_by(row_id) %>%
  mutate(n_constituencies = n()) %>%
  ungroup() %>%
  mutate(adjusted_price = value / n_constituencies) %>%
  group_by(constituency, election, candidateList) %>%
  summarise(
    total_contributions = sum(adjusted_price, na.rm = TRUE),
    .groups = "drop"
  )
con_candidates$constituency <- as.numeric(con_candidates$constituency)

# donations by candidates
don_candidates <- donations %>%
  mutate(row_id = row_number()) %>%
  separate_rows(constituency, sep = ",") %>%
  mutate(constituency = str_trim(constituency)) %>%
  group_by(row_id) %>%
  mutate(n_constituencies = n()) %>%
  ungroup() %>%
  mutate(adjusted_price = money / n_constituencies) %>%
  group_by(constituency, election, candidateList) %>%
  summarise(
    total_donations = sum(adjusted_price, na.rm = TRUE),
    .groups = "drop"
  )
don_candidates$constituency <- as.numeric(don_candidates$constituency)

sum(donations$money)
sum(don_candidates$total_donations)

sum(expenditures$usualPrice)
sum(exp_candidates$total_expenditures)

sum(contributions$value)
sum(con_candidates$total_contributions)

# spendings by candidates

spen_candidates <- merge(con_candidates,exp_candidates, by = c("constituency","election","candidateList"), all = T)
spen_candidates$total_spendings <- rowSums(spen_candidates[, c("total_contributions", "total_expenditures")], na.rm = TRUE)

#############################
########## DESCRIPTIVE ######
#############################

# spending over time
time_spend <- expenditures[,c("parties","election","partiesName","usualPrice")]
time_spend <- time_spend %>% rename(value = usualPrice)
time_spend <- rbind(time_spend,contributions[,c("parties","election","partiesName","value")])
time_spend_clean <- time_spend %>%
  mutate(
    codes = strsplit(parties, ","),
    names = strsplit(parties, ","),
    codes = map(codes, trimws),
    names = map(names, trimws)
  ) %>%
  rowwise() %>%
  mutate(
    n_parties = length(codes),
    split_value = value / n_parties
  ) %>%
  unnest(c(codes, names)) %>%
  ungroup()
time_spend_clean <- time_spend_clean %>%
  mutate(party_id = paste0(names))
time_spend_wide <- time_spend_clean %>%
  group_by(party_id, election) %>%
  summarise(total_value = sum(split_value), .groups = "drop") %>%
  pivot_wider(
    names_from = election,
    values_from = total_value,
    values_fill = 0
  )
time_spend_wide <- time_spend_wide[, intersect(c("party_id", "2018", "2019", "2020", "2021", "2022", "2023", "2024"), colnames(time_spend_wide))]
selected_parties <- c("768","001","047","053","720","007","1114","166","721","080")
subtitle_labels <- c("ANO","KDU-ČSL","KSČM","ODS","Pirates","SOCDEM","SPD","STAN","TOP 09","Independent")
time_spend_long <- time_spend_wide %>%
  filter(party_id %in% selected_parties) %>%
  pivot_longer(cols = -party_id, names_to = "year", values_to = "spending") %>%
  mutate(
    year = as.integer(year),
    spending = spending / 1e6,  # convert to millions
    label = factor(party_id, levels = selected_parties, labels = subtitle_labels)
  )
ggplot(time_spend_long, aes(x = year, y = spending)) +
  geom_line(color = "black") +
  geom_point(shape = 4, color = "black", size = 1.5) +
  facet_wrap(~ label, ncol = 3) +
  scale_y_continuous(labels = scales::label_number(suffix = "")) +
  labs(x = "Year", y = "Spendings (M)") +
  theme_minimal(base_size = 12) +
  theme(panel.grid.minor.x = element_blank())
ggsave("export/time_spend.jpeg", width = 10, height = 6, dpi = 300)

# donations over time
time_don <- donations[,c("parties","election","partiesName","money")]
time_don_clean <- time_don %>%
  mutate(
    codes = strsplit(parties, ","),
    names = strsplit(parties, ","),
    codes = map(codes, trimws),
    names = map(names, trimws)
  ) %>%
  rowwise() %>%
  mutate(
    n_parties = length(codes),
    split_value = money / n_parties
  ) %>%
  unnest(c(codes, names)) %>%
  ungroup()
time_don_clean <- time_don_clean %>%
  mutate(party_id = paste0(names))
time_don_wide <- time_don_clean %>%
  group_by(party_id, election) %>%
  summarise(total_value = sum(split_value), .groups = "drop") %>%
  pivot_wider(
    names_from = election,
    values_from = total_value,
    values_fill = 0
  )
time_don_wide <- time_don_wide[, intersect(c("party_id", "2018", "2019", "2020", "2021", "2022", "2023", "2024"), colnames(time_don_wide))]
selected_parties <- c("768","001","047","053","720","007","1114","166","721","080")
subtitle_labels <- c("ANO","KDU-ČSL","KSČM","ODS","Pirates","SOCDEM","SPD","STAN","TOP 09","Independent")
time_don_long <- time_don_wide %>%
  filter(party_id %in% selected_parties) %>%
  pivot_longer(cols = -party_id, names_to = "year", values_to = "spending") %>%
  mutate(
    year = as.integer(year),
    spending = spending / 1e6,  # convert to millions
    label = factor(party_id, levels = selected_parties, labels = subtitle_labels)
  )
ggplot(time_don_long, aes(x = year, y = spending)) +
  geom_line(color = "black") +
  geom_point(shape = 4, color = "black", size = 1.5) +
  facet_wrap(~ label, ncol = 3) +
  scale_y_continuous(labels = scales::label_number(suffix = "")) +
  labs(x = "Year", y = "Donations (M)") +
  theme_minimal(base_size = 12) +
  theme(panel.grid.minor.x = element_blank())
ggsave("export/time_don.jpeg", width = 10, height = 6, dpi = 300)

# expenditures across time
table(is.na(expenditures$expenseDate))

expenditures <- expenditures %>%
  mutate(
    expenseDate = as.Date(expenseDate),
    movementDate = as.Date(movementDate),
    effective_date = coalesce(expenseDate, movementDate),
    election_date = case_when(
      election == 2024 ~ as.Date("2024-09-20"),
      election == 2022 ~ as.Date("2022-09-23"),
      election == 2020 ~ as.Date("2020-10-02"),
      election == 2018 ~ as.Date("2018-10-05"),
      TRUE ~ NA_Date_
    ),
    days_relative_election = as.integer(effective_date - election_date),
    week_relative_election = floor(days_relative_election / 7)
  )

weekly_expenditures <- expenditures %>%
  group_by(election, week_relative_election) %>%
  summarise(total_expenditures = sum(usualPrice, na.rm = TRUE), .groups = "drop")
ggplot(weekly_expenditures, aes(x = week_relative_election, y = total_expenditures / 1e6)) +
  geom_col(fill = "black") +
  labs(
    x = "Weeks Before the Election",
    y = "Expenditures (M)",
    title = ""
  ) +
  theme_minimal(base_size = 12) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey30") +
  scale_x_continuous(limits = c(-30, 3), breaks = seq(-30, 3, 2)) +
  scale_y_continuous(labels = label_number(suffix = "", accuracy = 1)) +
  facet_wrap(~ election, ncol = 1)
ggsave("export/before_expend.jpeg", width = 10, height = 5, dpi = 300)

# donations across time
table(is.na(donations$date))
donations <- donations %>%
  mutate(
    election_date = case_when(
      election == 2024 ~ as.Date("2024-09-20"),
      election == 2022 ~ as.Date("2022-09-23"),
      election == 2020 ~ as.Date("2020-10-02"),
      election == 2018 ~ as.Date("2018-10-05"),
      TRUE ~ NA_Date_
    ),
    days_relative_election = as.numeric(as.Date(date) - election_date)
  )
donations_by_day <- donations %>%
  group_by(election, days_relative_election) %>%
  summarise(n_donations = n(), .groups = "drop")
ggplot(donations_by_day, aes(x = days_relative_election, y = n_donations)) +
  geom_line(size = 0.5, color = "black") +
  labs(
    x = "Days Before the Election",
    y = "Donations (N)"
  ) +
  theme_minimal(base_size = 12) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey30") +
  scale_x_continuous(limits = c(-200, 20), breaks = seq(-200, 20, 20)) +
  scale_y_continuous(limits = c(0, 150)) +
  facet_wrap(~ election, ncol = 1)
ggsave("export/before_donations.jpeg", width = 10, height = 5, dpi = 300)


#############################
########## MAPS #############
#############################

# map of spendings
map_spend <- expenditures[expenditures$election%in%c(2020:2024),c("parties","constituency","usualPrice")]
map_spend <- map_spend %>% rename(value = usualPrice)
map_spend <- rbind(map_spend,contributions[contributions$election%in%c(2020:2024),c("parties","constituency","value")])
map_spend <- map_spend %>%
  mutate(
    parties = strsplit(parties, ","),
    constituencies = strsplit(constituency, ",")
  ) %>%
  rowwise() %>%
  mutate(
    n_parties = length(parties),
    n_constituencies = length(constituencies),
    n_combinations = n_parties * n_constituencies,
    split_price = value / n_combinations
  ) %>%
  unnest(parties) %>%
  unnest(constituencies) %>%
  ungroup() %>%
  mutate(
    parties = trimws(parties),
    constituencies = trimws(constituencies)
  ) %>%
  group_by(constituencies, parties) %>%
  summarise(total_spending = sum(split_price), .groups = "drop") %>%
  pivot_wider(
    names_from = parties,
    values_from = total_spending,
    values_fill = 0
  )
map_spend$constituencies <- as.numeric(map_spend$constituencies)
map_spend <- map_spend %>%
  mutate(total_spending = rowSums(across(-constituencies))) %>%
  rename(constituency = constituencies) %>%
  dplyr::select(constituency, total_spending)
map_spend <- merge(map,map_spend,by="constituency")
map_spend <- merge(map_spend,candidates[candidates$election%in%c(2020:2024),],by=c("constituency"))
map_spend$spendings_can <- map_spend$total_spending/map_spend$candidates
quantile_breaks <- quantile(map_spend$spendings_can, probs = seq(0, 1, 0.25), na.rm = TRUE)
map_spend <- map_spend %>%
  mutate(
    spending_quantile = cut(
      spendings_can,
      breaks = quantile_breaks,
      include.lowest = TRUE,
      labels = paste0(
        "", round(quantile_breaks[-5] / 1e3, 0), "-", round(quantile_breaks[-1] / 1e3, 0), ""
      )
    )
  )
ggplot(map_spend) +
  geom_sf(aes(fill = spending_quantile), color = "white", size = 0.1) +
  scale_fill_manual(
    values = c("grey90", "grey60", "grey30", "black"),
    name = "Spendings per \nCandidate (in K)",
    labels = function(x) {
      sapply(x, function(lbl) {
        parts <- unlist(strsplit(lbl, "-"))
        paste0(
          formatC(as.numeric(parts[1]), format = "d", big.mark = ","),
          if (length(parts) == 2) paste0("-", formatC(as.numeric(parts[2]), format = "d", big.mark = ",")) else ""
        )
      })
    }
  ) +
  theme_void()
ggsave("export/map_spend.jpeg", width = 10, height = 5, dpi = 300)

# map of donations
map_don <- donations[donations$election%in%c(2020:2024),c("parties","constituency","money")]
map_don <- map_don %>%
  mutate(
    parties = strsplit(parties, ","),
    constituencies = strsplit(constituency, ",")
  ) %>%
  rowwise() %>%
  mutate(
    n_parties = length(parties),
    n_constituencies = length(constituencies),
    n_combinations = n_parties * n_constituencies,
    split_price = money / n_combinations
  ) %>%
  unnest(parties) %>%
  unnest(constituencies) %>%
  ungroup() %>%
  mutate(
    parties = trimws(parties),
    constituencies = trimws(constituencies)
  ) %>%
  group_by(constituencies, parties) %>%
  summarise(total_spending = sum(split_price), .groups = "drop") %>%
  pivot_wider(
    names_from = parties,
    values_from = total_spending,
    values_fill = 0
  )
map_don$constituencies <- as.numeric(map_don$constituencies)
map_don <- map_don %>%
  mutate(total_spending = rowSums(across(-constituencies))) %>%
  rename(constituency = constituencies) %>%
  dplyr::select(constituency, total_spending)
map_don <- merge(map,map_don,by="constituency")
map_don <- merge(map_don,candidates[candidates$election%in%c(2020:2024),],by=c("constituency"))
map_don$donations_can <- map_don$total_spending/map_don$candidates
quantile_breaks <- quantile(map_don$donations_can, probs = seq(0, 1, 0.25), na.rm = TRUE)
map_don <- map_don %>%
  mutate(
    donation_quantile = cut(
      donations_can,
      breaks = quantile_breaks,
      include.lowest = TRUE,
      labels = paste0(
        "", round(quantile_breaks[-5] / 1e3, 0), "-", round(quantile_breaks[-1] / 1e3, 0), ""
      )
    )
  )
ggplot(map_don) +
  geom_sf(aes(fill = donation_quantile), color = "white", size = 0.1) +
  scale_fill_manual(
    values = c("grey90", "grey60", "grey30", "black"),
    name = "Donations per \nCandidate (in K)"
  ) +
  theme_void()
ggsave("export/map_don.jpeg", width = 10, height = 5, dpi = 300)

#############################
########## ANALYSIS #########
#############################

# individual level analysis
serk <- merge(serk,exp_candidates,by = c("constituency","election","candidateList"),all.x = T)
serk$total_expenditure <- ifelse(is.na(serk$total_expenditure),0,serk$total_expenditure)
serk <- merge(serk,con_candidates,by = c("constituency","election","candidateList"),all.x = T)
serk$total_contributions <- ifelse(is.na(serk$total_contributions),0,serk$total_contributions)
serk <- merge(serk,don_candidates,by = c("constituency","election","candidateList"),all.x = T)
serk$total_donations <- ifelse(is.na(serk$total_donations),0,serk$total_donations)
serk$total_spendings <- serk$total_expenditure + serk$total_contributions
serk <- merge(serk,map[,c("constituency","distance")],by="constituency")
serk <- merge(serk,candidates, by = c("constituency","election"))
serk <- merge(serk,margins, by = c("constituency","election"))
unique_combinations <- expenditures %>% distinct(election, candidateList, partiesName) # add partiesName
serk <- merge(serk,unique_combinations[,c("election","candidateList","partiesName")],by = c("election","candidateList"), all.x = T)
serk$partiesName <- ifelse(is.na(serk$partiesName),serk$ZKRATKAV8,serk$partiesName)
serk$independent <- ifelse(serk$partiesName=="NK",1,0)
serk$parl_parties <- ifelse(serk$election==2018 & grepl("ANO|ODS|STAN|KSČM|Piráti|KDU-ČSL|TOP 09|SPD|ČSSD", serk$partiesName),1,0)
serk$parl_parties <- ifelse(serk$election==2020 & grepl("ANO|ODS|STAN|KSČM|Piráti|KDU-ČSL|TOP 09|SPD|ČSSD", serk$partiesName),1,serk$parl_parties)
serk$parl_parties <- ifelse(serk$election==2022 & grepl("ANO|ODS|STAN|Piráti|KDU-ČSL|TOP 09|SPD", serk$partiesName),1,serk$parl_parties)
serk$parl_parties <- ifelse(serk$election==2024 & grepl("ANO|ODS|STAN|Piráti|KDU-ČSL|TOP 09|SPD", serk$partiesName),1,serk$parl_parties)
serk$government <- 0 # government parties
serk$government <- ifelse(serk$election==2018 & grepl("ANO|ČSSD|KSČM", serk$partiesName),1,serk$government)
serk$government <- ifelse(serk$election==2020 & grepl("ANO|ČSSD|KSČM", serk$partiesName),1,serk$government)
serk$government <- ifelse(serk$election==2022 & grepl("ODS|TOP|KDU|STAN|SLK|Piráti", serk$partiesName),1,serk$government)
serk$government <- ifelse(serk$election==2024 & grepl("ODS|TOP|KDU|STAN|SLK|Piráti", serk$partiesName),1,serk$government)
serk <- serk %>% # effective number of candidates
  group_by(election, constituency) %>%
  mutate(
    p = PROC_K1 / sum(PROC_K1, na.rm = TRUE),
    N_eff = 1 / sum(p^2, na.rm = TRUE),
    n_candidates = n(),
    comp_ind = N_eff / n_candidates
  ) %>%
  ungroup()
serk <- serk %>% # concentration - sum of percentages of the first two candidates
  group_by(election, constituency) %>%
  arrange(desc(PROC_K1), .by_group = TRUE) %>%
  mutate(
    concentration = sum(head(PROC_K1, 2), na.rm = TRUE)
  ) %>%
  ungroup()
library(dplyr) 
library(stringr) 
serk <- serk %>%
  group_by(election, constituency) %>%
  arrange(desc(PROC_K1), .by_group = TRUE) %>%
  mutate(
    first  = PROC_K1[1],
    second = PROC_K1[2],
    third = PROC_K1[3],
    closeness_2_1 = (second / first) * 100,
    closeness_3_2 = (third / second) * 100
  ) %>%
  ungroup()
serk <- serk %>% # government and opposition candidate in the second round
  group_by(election, constituency) %>%
  mutate(
    blocks = ifelse(
      sum(PROC_K2 > 0, na.rm = TRUE) == 2 &
        sum(parl_parties[PROC_K2 > 0] == 1, na.rm = TRUE) == 2 &
        length(unique(government[PROC_K2 > 0])) == 2,
      1, 0
    )
  ) %>%
  ungroup()
serk <- merge(serk,set5[,c("constituency","election","TURNOUT")], by = c("constituency","election")) # turnout

# variables distribution
hist(serk$total_spendings,breaks = 30)
hist(serk$total_donations,breaks = 30)
hist(serk$distance)
hist(serk$candidates)
hist(serk$concentration)
hist(serk$closeness_2_1)
hist(serk$closeness_3_2)
hist(serk$TURNOUT)

# poisson vs negative binomial
var(serk$total_spendings)
mean(serk$total_spendings)
var(serk$total_donations)
mean(serk$total_donations)

# multivariate models
reg_ind_spend <- glm.nb(total_spendings ~ independent + parl_parties + government + incumbent.y + female + VEK + university + 
                          candidates + comp_ind + closeness_2_1 + blocks + distance + TURNOUT, 
                      data = serk)
vif(reg_ind_spend)
summary(reg_ind_spend)
reg_ind_don <- glm.nb(total_donations ~ independent + parl_parties + government + incumbent.y + female + VEK + university + 
                        candidates + comp_ind + closeness_2_1 + blocks + distance + TURNOUT,
                      data = serk)
vif(reg_ind_don)
summary(reg_ind_don)

stargazer(reg_ind_spend, reg_ind_don, type = "html", out = "export/models_nb.html")

# constituency level analysis
data_con <- spen_candidates %>%
  group_by(constituency, election) %>%
  summarise(total_spendings = sum(total_spendings, na.rm = TRUE), .groups = "drop")
data_con <- merge(data_con,margins,by=c("constituency","election"), all.x = T)
data_con <- merge(data_con,candidates,by=c("constituency","election"), all = T)
data_con <- merge(data_con,map[,c("constituency","distance")],by=c("constituency"), all = T)
data_con$spending_rel <- data_con$total_spendings/data_con$candidates
data_con <- merge(data_con,serk[,c("constituency","election","comp_ind","closeness_2_1","closeness_3_2","blocks")],
                  by = c("constituency","election"), all.x = T)
data_con <- distinct(data_con)

serk_summary <- serk %>%
  group_by(constituency, election) %>%
  summarise(
    pct_independent  = mean(independent, na.rm = TRUE),
    pct_parliament   = mean(parl_parties, na.rm = TRUE),
    pct_university   = mean(university, na.rm = TRUE),
    avg_age          = mean(VEK, na.rm = TRUE), 
    pct_female       = mean(female, na.rm = TRUE),
    pct_government   = mean(government, na.rm = TRUE),
    total_donations  = sum(total_donations, na.rm = TRUE),
    .groups = "drop")
data_con <- data_con %>%
  left_join(serk_summary, by = c("constituency", "election"))
data_con$donations_rel <- data_con$total_donations/data_con$candidates

hist(data_con$spending_rel)
hist(data_con$donations_rel)
hist(data_con$distance)
hist(data_con$closeness_2_1)
hist(data_con$closeness_3_2)

reg_con_spend <- lm(spending_rel ~ distance + incumbent + closeness_2_1 + comp_ind + blocks, 
              data = data_con)
vif(reg_con_spend)
summary(reg_con_spend)

reg_con_don <- lm(donations_rel ~ distance + incumbent + closeness_2_1 + comp_ind + blocks, 
              data = data_con)
vif(reg_con_don)
summary(reg_con_don)

#############################
########## PLOTS ############
#############################

# individual spendings
pred_ind_spend_cand <- ggpredict(reg_ind_spend, ci_level = 0.95, terms = "candidates[2:16]",
                                      condition = list(incumbent.y = 1, female = FALSE, university = 1, independent = 0, parl_parties = 1, government = 0, blocks = 0))
ggplot(pred_ind_spend_cand, aes(x = x, y = predicted)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, fill = "grey50") +
  geom_line(size = 1, color = "black") +
  labs(title = "", x = "Candidates", y = "Predicted Spending (M)") +
  theme_minimal(base_size = 12) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(labels = label_number(scale = 1e-6, suffix = ""))
ggsave("export/cand_spend.jpeg", width = 10, height = 5, dpi = 300)

pred_ind_spend_close <- ggpredict(reg_ind_spend, ci_level = 0.95, terms = "closeness_2_1[0:100]",
                                 condition = list(incumbent.y = 1, female = FALSE, university = 1, independent = 0, parl_parties = 1, government = 0, blocks = 0))
ggplot(pred_ind_spend_close, aes(x = x, y = predicted)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, fill = "grey50") +
  geom_line(size = 1, color = "black") +
  labs(title = "", x = "2nd/1st Candidate 1st Round Gain (%)", y = "Predicted Spending (M)") +
  theme_minimal(base_size = 12) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(labels = label_number(scale = 1e-6, suffix = "")) + 
  theme(panel.grid.minor.x = element_blank())
ggsave("export/closeness_spend.jpeg", width = 10, height = 5, dpi = 300)

# combination of the two previous figures
p1 <- ggplot(pred_ind_spend_cand, aes(x = x, y = predicted)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, fill = "grey50") +
  geom_line(size = 1, color = "black") +
  labs(title = "a)", x = "Candidates", y = "Predicted Spending (M)") +
  theme_minimal(base_size = 12) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(
    limits = c(300000, 1800000),
    breaks = seq(0, 1800000, 200000),
    labels = scales::label_number(scale = 1e-6, suffix = "")
  )
p2 <- ggplot(pred_ind_spend_close, aes(x = x, y = predicted)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, fill = "grey50") +
  geom_line(size = 1, color = "black") +
  labs(title = "b)", x = "2nd/1st Candidate 1st Round Gain (%)", y = NULL) +
  theme_minimal(base_size = 12) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(
    limits = c(300000, 1800000),
    breaks = seq(0, 1800000, 200000),
    labels = scales::label_number(scale = 1e-6, suffix = "")
  ) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.minor.x = element_blank()
  )
combined <- p1 + p2 + plot_layout(ncol = 2, guides = "collect") & 
  theme(plot.margin = margin())
ggsave("export/predicted_spending.jpeg", width = 10, height = 5, dpi = 300)

pred_ind_spend_pp <- ggpredict(reg_ind_spend, ci_level = 0.95, terms = "parl_parties[0:1]",
                               condition = list(incumbent.y = 1, female = FALSE, university = 1, independent = 0, government = 0, blocks = 0))
pred_ind_spend_pp
pred_ind_spend_ind <- ggpredict(reg_ind_spend, ci_level = 0.95, terms = "independent[0:1]",
                             condition = list(incumbent.y = 1, female = FALSE, university = 1, parl_parties = 1, government = 0, blocks = 0))
pred_ind_spend_ind
pred_ind_spend_uni <- ggpredict(reg_ind_spend, ci_level = 0.95, terms = "university[0:1]",
                              condition = list(incumbent.y = 1, female = FALSE, independet = 0, parl_parties = 1, government = 0, blocks = 0))
pred_ind_spend_uni

# individual donations
pred_ind_don_ind <- ggpredict(reg_ind_don, ci_level = 0.95, terms = "independent[0:1]",
                                condition = list(incumbent.y = 1, female = FALSE, university = 1, parl_parties = 1, government = 0, blocks = 0))
pred_ind_don_ind

library(openxlsx)

# variables you want
vars <- c("total_spendings", "total_donations", "independent", "parl_parties", 
          "government", "incumbent.y", "female", "VEK", "university", 
          "candidates", "comp_ind", "closeness_2_1", "blocks", "distance", "TURNOUT")

# subset only available variables
data_sub <- serk[, vars[vars %in% names(serk)]]

# compute descriptive stats
desc_table <- data.frame(
  Variable = names(data_sub),
  n = sapply(data_sub, function(x) sum(!is.na(x))),
  min = sapply(data_sub, function(x) min(x, na.rm = TRUE)),
  max = sapply(data_sub, function(x) max(x, na.rm = TRUE)),
  mean = sapply(data_sub, function(x) mean(x, na.rm = TRUE)),
  sd = sapply(data_sub, function(x) sd(x, na.rm = TRUE))
)

# round for readability
desc_table[, -1] <- round(desc_table[, -1], 2)

# export to Excel
write.xlsx(desc_table, "export/descriptive_stats.xlsx")

# number of constituencies
table(ifelse(nchar(expenditures$constituency) <=2, "Case1", "Other"))
prop.table(table(ifelse(nchar(expenditures$constituency) <=2, "Case1", "Other"))) * 100
table(ifelse(nchar(contributions$constituency) <=2, "Case1", "Other"))
prop.table(table(ifelse(nchar(contributions$constituency) <=2, "Case1", "Other"))) * 100
table(ifelse(nchar(donations$constituency) <=2, "Case1", "Other"))
prop.table(table(ifelse(nchar(donations$constituency) <=2, "Case1", "Other"))) * 100

# number of donations
table(donations$election[is.na(donations$date)])/table(donations$election)

table(expenditures$election[is.na(expenditures$expenseDate)])/table(expenditures$election)
table(expenditures$election[is.na(expenditures$effective_date)])/table(expenditures$election)





